Load required packages

library(plyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(dplyr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4

Step 1

Load the Indeed dataset from the GitHub project repository.

Step 2: Tidying the data

Removing the radius column from read_indeed_url

# Drop the fifth column (the search radius); it is not used in the analysis.
read_indeed_url <- read_indeed_url[, -5L]
head(read_indeed_url, n = 6L)

Rename the columns in read_indeed_url

# Give the six remaining columns descriptive names, in positional order.
colnames(read_indeed_url) <- c(
  "Source",
  "Job Title",
  "Skills",
  "City",
  "url",
  "Count"
)
head(read_indeed_url, n = 6L)

Removing rows with NA values from the read_indeed_url data frame

# Keep only the rows that have no missing value in any column.
read_indeed_url <- na.omit(object = read_indeed_url)
head(read_indeed_url, n = 6L)

Sorting the ny_indeed data frame by count

# Order the NY postings from the highest to the lowest count.
# The column is referenced by its bare name so that arrange() resolves it
# through data masking on the piped data frame, instead of reaching back to
# the global ny_indeed object with `$`.
ny_indeed <- ny_indeed %>%
  arrange(desc(count))
## Warning: package 'bindrcpp' was built under R version 3.4.4

Aggregate the skills to get the frequency

# Sum the posting counts per skill with base R; the result has one row per
# skill, with the skill in `Category` and the summed count in `x`.
indeed_skillaggr <- with(
  read_indeed_url,
  aggregate(Count, by = list(Category = Skills), FUN = sum)
)
indeed_skillaggr

Jobs by skills

# Total posting count per skill, largest total first.
skills_count <- arrange(
  summarise(group_by(read_indeed_url, Skills), Total = sum(Count)),
  desc(Total)
)

skills_count

Job openings by skill and city

# Total posting count for every skill/city combination, largest total first.
# summarise() only peels off the last grouping variable, so the result would
# still be grouped by Skills; ungroup() returns a plain table so later steps
# are not silently evaluated per group.
skills_city <- read_indeed_url %>%
  group_by(Skills, City) %>%
  summarise(Total = sum(Count)) %>%
  ungroup() %>%
  arrange(desc(Total))

skills_city

Grouping ny_indeed dataset by type

# Sum the keyword counts within each skill type of the NY data.
# Only `type` and `count` take part in the summary, so no explicit column
# selection is needed before summarising.
grpd <- ny_indeed %>%
  group_by(type) %>%
  summarise(sum_by_type = sum(count))

Step 3: Data Visualization

# Horizontal bar chart of the ten most frequently requested skills.
# skills_count is sorted by Total in descending order, so the top ten rows
# sit at the head of the table; tail() would select the ten least requested
# skills instead.
plots_top <- head(skills_count, 10)

#ggplot(plots_top, aes(plots_top$Skills, plots_top$Total)) + geom_bar(stat="identity")

# Dark2 provides at most 8 colours; barplot() recycles them for extra bars.
darkcols <- brewer.pal(8, "Dark2")
# Stored as skill_labels rather than `names` to avoid masking base::names().
skill_labels <- plots_top$Skills
barplot(
  plots_top$Total,
  main = "Indeed Counts",
  horiz = TRUE,
  names.arg = skill_labels,
  las = 1,
  col = darkcols,
  cex.axis = 0.5,
  cex.names = 0.5
)

# Bubble chart of the ten largest skill/city totals: one point per
# combination, coloured by city and sized by the total count.
# head() returns at most ten rows, whereas indexing with 1:10 pads the result
# with NA rows (or fails) when the table has fewer than ten rows.
top10_skills <- head(skills_city, 10)
ggplot(
  top10_skills,
  aes(x = Skills, y = Total, colour = City, size = Total)
) +
  geom_point()

library(wordcloud)
# Word cloud of skills sized by total count; random.order = FALSE places the
# most frequent skills first.
# With the default scale of c(4, 0.5) the most frequent term ("Machine
# Learning") was too large to be placed and was silently left out of the
# plot, so the maximum word size is reduced.
# NOTE(review): whether every word fits still depends on the size of the
# graphics device; lower the first scale value further if the warning recurs.
wordcloud(
  skills_count$Skills,
  skills_count$Total,
  scale = c(3, 0.5),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(skills_count$Skills, skills_count$Total, random.order
## = FALSE, : Machine Learning could not be fit on page. It will not be
## plotted.

Drilling down on the Data Scientist jobs in NY. Let's look at a horizontal bar chart of all skills, with the type indicated by the bar's color.

# Order the keyword factor levels by ascending count so the horizontal bars
# run from the least- to the most-mentioned keyword.
# The values are ordered first and unique() is applied afterwards: indexing
# unique(key_words) with a row-length order() vector would yield NA levels as
# soon as a keyword appears in more than one row.
ny_indeed$key_words <- factor(
  ny_indeed$key_words,
  levels = unique(ny_indeed$key_words[order(ny_indeed$count)])
)

# Plot margins (left, right, bottom, top) and padding; passed to layout()
# below so that the axis labels have room around the plot area.
m <- list(
  l = 100,
  r = 100,
  b = 100,
  t = 100,
  pad = 4
)

# Horizontal bar per keyword, coloured by skill type.
key_word_plot <- plot_ly(
  data = ny_indeed,
  x = ~count,
  y = ~key_words,
  type = "bar",
  orientation = "h",
  color = ~type
) %>%
  layout(title = "Skills Required of Data Scientists in NY", margin = m)

key_word_plot

Now let's look at which type of skill was mentioned the most in job descriptions by plotting the aggregated data.

# Order the skill-type factor levels by ascending total so the horizontal
# bars run from the least- to the most-mentioned type.
# Ordering the values before taking unique() keeps the levels valid even if a
# type were to appear in more than one row.
grpd$type <- factor(
  grpd$type,
  levels = unique(grpd$type[order(grpd$sum_by_type)])
)

# Horizontal bar per skill type showing its summed count.
sum_by_type <- plot_ly(
  data = grpd,
  x = ~sum_by_type,
  y = ~type,
  type = "bar",
  orientation = "h",
  color = ~type
) %>%
  layout(title = "NY Skills by Type")

sum_by_type

# Step 4: Conclusion

The top 5 hard skills are oriented towards Big Data; Python and R are the words used most often in the dataset. Our findings show that a few skills underlie the field of data science. The importance of an education in Mathematics is clear from the number of mentions in the job descriptions for Data Scientists. We cannot draw definitive conclusions about in-demand skills, since some other skills are missing from the dataset.